dat <- readstata13::read.dta13('applications/dataverse_files/comment_analysis.dta')

dat_wide <- dat %>% group_by(id) %>% 
  mutate(comment_mean = mean(comment, na.rm = TRUE), 
         homeowner_mean = mean(homeowner, na.rm = TRUE),
         comment_change = comment -  comment_mean, 
         homeowner_change = homeowner - homeowner_mean) %>% 
  pivot_wider(id_cols = id, names_from = year, 
              values_from = c(comment_change, homeowner_change))

data <- dat_wide

coefFn <- function(data, w, form, coef){
  library(tidyverse)
  data$w <- w
  
  homeowner <- data %>% pivot_longer(cols = starts_with('home'), 
                        names_to = 'year', values_to = 'treat') %>%
    mutate(year = as.numeric(substr(year, 18, 21)))
  
  comment <- data[, c(1:23)] %>% pivot_longer(cols = starts_with('comment'), 
                                           names_to = 'year', values_to = 'outcome') %>%
    mutate(year = as.numeric(substr(year, 16, 19)))
  
  dat <- left_join(homeowner, comment, by = c('id', 'year'))
  
  coef(lm(form, data = dat, w = w))[coef]
  
}

set.seed(02138)

form <- formula(outcome ~ treat + factor(year))
coef <- 'treat'

#  Private quantile estimation 
n = nrow(dat_wide)
P = 300
target_quantile = 0.6
epsilon_quantile = 0.2

dat_list <- splitData(N = n, n = n / P, P = P)

partition_estimate <- c()

for (i in 1:length(dat_list)) {
  partition_estimate[i] <- coefFn(data[dat_list[[i]], ], w = 1, form = form, coef = coef)
}

lambda <- estimateQuantile(target_quantile = target_quantile, rng = c(0, 10), epsilon = epsilon_quantile, data = partition_estimate)

#

dp_estimates <- suppressWarnings(algorithmUDP(data = dat_wide, statistic = coefFn, B = 1, n = nrow(dat_wide)/300,
              P = 300, lambda = lambda, lambda_var = 0.005, delta = 1/5978,
             epsilon = 0.5, epsilon_alpha = 0.5, parallelize = T, censoring_cutoff = 0.9,
             bias_cutoff = 0.1, disjoint = T, form = form, coef = coef))


homeowner <- data %>% pivot_longer(cols = starts_with('home'), 
                                   names_to = 'year', values_to = 'treat') %>%
  mutate(year = as.numeric(substr(year, 18, 21)))

comment <- data[, c(1:23)] %>% pivot_longer(cols = starts_with('comment'), 
                                            names_to = 'year', values_to = 'outcome') %>%
  mutate(year = as.numeric(substr(year, 16, 19)))

dat <- left_join(homeowner, comment, by = c('id', 'year'))

private_model <- lm(form, data = dat)


# Plot 
private_se <- summary(private_model)$coef[2, 2]
private_point <- summary(private_model)$coef[2, 1]

plot_homeowner <- ggplot() + 
  geom_point(aes(y = private_point, x = 'Original'), col = 'navy') +
  geom_errorbar(aes(ymin = private_point - qnorm(0.975)*private_se, 
                    ymax = private_point + qnorm(0.975)*private_se, x = "Original"), col = 'navy', width = 0) + 
  geom_point(aes(y = dp_estimates$theta_tilde, x = 'Privatized'), col = 'dodgerblue') +
  geom_errorbar(aes(ymin = dp_estimates$theta_tilde - qnorm(0.975)*sqrt(dp_estimates$var_est), 
                    ymax = dp_estimates$theta_tilde + qnorm(0.975)*sqrt(dp_estimates$var_est), 
                    x = "Privatized"), col = 'dodgerblue', width = 0) +
    theme_bw() + 
  geom_point(aes(y = dp_estimates$theta_hat, x = 'Privatized\n(no correction)'), col = 'orange3', shape = 18, size = 4) +
  ylim(c(0, 0.1)) + 
  scale_x_discrete(limits = c('Original', 'Privatized', 'Privatized\n(no correction)')) + 
  theme(panel.grid.major = element_blank(),  legend.position = 'none')  + 
  labs( y = 'Effect of homeownership', x = '')

ggsave(plot_homeowner, file = 'plots/fig5a.pdf', width = 3, height = 3)



# Varying Lambda 

#set.seed(02138)

lambdas <- c(0.04, 0.07, 0.1, 0.15, 0.3, 0.5)
nsims <- 500
estimates_lambdas <- lapply(lambdas, function(i){
  
  sims <- sapply(1:nsims, function(j) {
    
    tryCatch({
      ests <-  algorithmUDP(data = dat_wide, statistic = coefFn, B = 1, n = nrow(dat_wide)/300,
                            P = 300, lambda = i, lambda_var = 0.005, delta = 1/nrow(dat_wide),
                            epsilon = 0.5, epsilon_alpha = 0.5, parallelize = T, censoring_cutoff = 0.9,
                            bias_cutoff = 0.1, disjoint = T, form = form, coef = coef)
      
      dp_estimates <- as.numeric(ests$theta_tilde)  #, var_est = ests$var_est)
      
      #print(dp_estimates)
      }, error = function(e) {
        dp_estimates <- NA #, var_est = NA)
      })

    
    return(dp_estimates)
    
    })
  
    return(sims)
                      
})



est_df <- as.data.frame(do.call(rbind, lapply(1:length(lambdas), function(i) cbind(estimates_lambdas[[i]], lambdas[i]))))
                        
colnames(est_df) <- c('est', 'lambda')                       

est_df$est <- unlist(est_df$est)
est_df$lambda <- unlist(est_df$lambda)

plot_lambda <- ggplot(est_df) + 
  geom_boxplot(aes(x = as.factor(lambda), y = est), width = 0.25) +
labs(x = latex2exp::TeX('$\\Lambda$'), y = 'Estimated effect of homeownership') + 
  theme_bw() + 
  theme(panel.grid.major = element_blank(),  legend.position = 'none') 

ggsave(plot_lambda, file = 'plots/fig6.pdf', width = 5, height = 3.5)


  